// Ported from https://github.com/bloc97/Anime4K/blob/master/glsl/Restore/Anime4K_Restore_CNN_Soft_L.glsl

//!MAGPIE EFFECT
//!VERSION 2
//!OUTPUT_WIDTH INPUT_WIDTH
//!OUTPUT_HEIGHT INPUT_HEIGHT


// Source image. Listed as the //!IN resource of Pass 1, which reads it with
// GatherRed/GatherGreen/GatherBlue.
//!TEXTURE
Texture2D INPUT;

// Point-filtered sampler used for every Gather*/SampleLevel call in the passes below.
//!SAMPLER
//!FILTER POINT
SamplerState sam;

// Intermediate feature maps, all declared at input resolution with four
// 16-bit float channels each.
//
// tex1/tex2: written by Pass 1, read by Pass 2, and listed again as the
// outputs of Pass 3.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex2;

// tex3/tex4: written by Pass 2 and read by Pass 3.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex3;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex4;


//!PASS 1
//!DESC Conv-4x3x3x3
//!IN INPUT
//!OUT tex1, tex2
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Pass 1: first convolution layer, RGB in -> eight feature channels out.
//
// Each thread owns a 2x2 quad of output pixels whose top-left corner is
// `quadOrigin`. It loads the 4x4 RGB window surrounding that quad with four
// sets of Gather calls, then evaluates a 3x3 convolution for every pixel of
// the quad and stores the two float4 results in tex1 and tex2.
//
// blockStart: added to the per-thread offset to form the quad origin.
// threadId:   only threadId.x is used; it is passed to Rmp8x8 (defined outside
//             this file — presumably remaps the linear thread index to a 2D
//             offset inside the block; confirm against the Magpie runtime).
void Pass1(uint2 blockStart, uint3 threadId) {
	const uint2 quadOrigin = (Rmp8x8(threadId.x) << 1) + blockStart;
	const uint2 bounds = GetInputSize();
	if (bounds.x <= quadOrigin.x || bounds.y <= quadOrigin.y) {
		return;
	}

	const float2 texel = GetInputPt();
	uint ix, iy;

	// taps[x][y] holds the RGB window; taps[1][1] corresponds to quadOrigin.
	float3 taps[4][4];
	[unroll]
	for (ix = 0; ix < 4; ix += 2) {
		[unroll]
		for (iy = 0; iy < 4; iy += 2) {
			const float2 gatherPos = (quadOrigin + uint2(ix, iy)) * texel;
			const float4 reds = INPUT.GatherRed(sam, gatherPos);
			const float4 greens = INPUT.GatherGreen(sam, gatherPos);
			const float4 blues = INPUT.GatherBlue(sam, gatherPos);

			// Layout of the gathered components within the 2x2 footprint:
			// w z
			// x y
			taps[ix][iy] = float3(reds.w, greens.w, blues.w);
			taps[ix][iy + 1] = float3(reds.x, greens.x, blues.x);
			taps[ix + 1][iy] = float3(reds.z, greens.z, blues.z);
			taps[ix + 1][iy + 1] = float3(reds.y, greens.y, blues.y);
		}
	}

	// (ix, iy) is the offset of the output pixel inside the quad, so the 3x3
	// window of that pixel spans taps[ix .. ix + 2][iy .. iy + 2].
	[unroll]
	for (ix = 0; ix < 2; ++ix) {
		[unroll]
		for (iy = 0; iy < 2; ++iy) {
			const uint2 outPos = quadOrigin + uint2(ix, iy);

			// The quad origin was already bounds-checked above; only the other
			// three pixels of the quad can fall outside the image.
			if ((ix != 0 || iy != 0) && (outPos.x >= bounds.x || outPos.y >= bounds.y)) {
				continue;
			}

			float4 conv1 = mul(taps[ix][iy], float3x4(-0.2676983, -0.1694746, 0.7231928, -0.050193843, 0.1850188, -0.4749505, 0.07632266, 0.17824799, 0.026348969, -0.213702, -0.16420218, -0.066780016));
			conv1 += mul(taps[ix][iy + 1], float3x4(-0.09888135, -0.079641104, 0.51160043, 0.53629893, -0.1368544, -0.07092336, 0.18622977, 0.6388427, 0.19499005, 0.06811229, -0.31991923, 0.088302985));
			conv1 += mul(taps[ix][iy + 2], float3x4(0.06487055, -0.1591197, -0.29304126, -0.428903, 0.1966732, 0.11229865, 0.089009434, 0.23463708, -0.22231965, -0.008649182, 0.3317394, 0.10976113));
			conv1 += mul(taps[ix + 1][iy], float3x4(0.40386826, -0.09486362, -0.058931742, -0.1341693, -0.28993917, -0.09050739, 0.28094417, 0.31630108, -0.02661985, -0.24368657, 0.096867286, 0.05391612));
			conv1 += mul(taps[ix + 1][iy + 1], float3x4(0.05631564, 0.34576723, -0.5587978, 0.16213721, 0.12679785, 0.18991663, -0.24762277, -0.33682153, -0.22863568, 0.20517963, 0.20418519, 0.12087338));
			conv1 += mul(taps[ix + 1][iy + 2], float3x4(-0.17579688, 0.18395603, -0.014987654, 0.30243605, -0.12778279, -0.07003458, -0.5353068, -0.39372426, 0.2676877, 0.255503, -0.29737592, -0.30513638));
			conv1 += mul(taps[ix + 2][iy], float3x4(0.799834, -0.023603538, 0.19820727, -0.11204286, -0.1566225, -0.1937577, -0.030266436, -0.10107911, 0.023661222, 0.16879195, 0.046644643, 0.09485681));
			conv1 += mul(taps[ix + 2][iy + 1], float3x4(-0.014675849, -0.110290475, -0.28381273, -0.06814732, 0.2067597, 0.20925248, -0.24068354, -0.5096708, -0.09384791, 0.10593733, 0.0672362, -0.06924161));
			conv1 += mul(taps[ix + 2][iy + 2], float3x4(0.05908883, 0.099426664, -0.20916614, -0.17044452, -0.091960385, 0.3218613, 0.41635308, -0.36125022, -0.012630896, -0.37540653, 0.018497325, -0.100674420));
			conv1 += float4(-0.55533427, -0.05231614, -0.032685343, -0.027457517);

			float4 conv2 = mul(taps[ix][iy], float3x4(0.09844753, -0.19389127, 0.029695928, 0.3805915, 0.1353029, 0.027786473, 0.15621242, 0.09383762, -0.1097243, 0.021245124, -0.016402386, 0.09129394));
			conv2 += mul(taps[ix][iy + 1], float3x4(0.3038283, 0.03778846, 0.1898852, 0.23949303, -0.34829387, 0.20485392, 0.60560244, 0.4089768, -0.260066, 0.42611003, 0.19227165, 0.03948586));
			conv2 += mul(taps[ix][iy + 2], float3x4(-0.033990905, 0.17583308, -0.2235879, 0.47376296, -0.1001787, 0.72851896, -0.056391567, -0.056544185, 0.0966166, -0.016663829, -0.15151545, -0.14227313));
			conv2 += mul(taps[ix + 1][iy], float3x4(-0.16544957, 0.05889452, -0.3277256, -0.42792717, 0.32491356, -0.39113912, 0.16600312, -0.3097514, 0.27907088, -0.22553465, 0.048548058, -0.08310438));
			conv2 += mul(taps[ix + 1][iy + 1], float3x4(0.03992136, 0.17895368, 0.16562924, -0.536188, -0.25868654, -0.4869832, 0.2591772, -0.5191932, 0.020162001, -0.41568524, 0.4776641, 0.019298514));
			conv2 += mul(taps[ix + 1][iy + 2], float3x4(0.14911795, -0.5984171, -0.18241958, 0.5472136, -0.69194865, 0.033839397, 0.13408412, 0.09503547, -0.21318413, 0.53743845, 0.080091774, -0.1369053));
			conv2 += mul(taps[ix + 2][iy], float3x4(-0.038978565, 0.40742934, 0.20107205, -0.3550106, 0.227634, -0.16101603, -0.45037574, 0.23192371, 0.17923234, -0.13692904, 0.10395048, 0.3124129));
			conv2 += mul(taps[ix + 2][iy + 1], float3x4(-0.059144646, -0.22531863, -0.024704054, -0.20749553, 0.58086175, -0.32206532, -0.5130457, -0.14057957, 0.24317528, 0.088735096, -0.44098017, -0.16980846));
			conv2 += mul(taps[ix + 2][iy + 2], float3x4(-0.30321437, 0.17502202, 0.1910563, -0.10118702, 0.1465326, 0.3852395, -0.31210947, 0.18236226, -0.23306467, -0.28551704, -0.2982589, 0.072740674));
			conv2 += float4(0.029685514, 0.066621915, 0.03600017, -0.03497038);

			tex1[outPos] = conv1;
			tex2[outPos] = conv2;
		}
	}
}


//!PASS 2
//!DESC Conv-4x3x3x16
//!IN tex1, tex2
//!OUT tex3, tex4
//!BLOCK_SIZE 8
//!NUM_THREADS 64

// Pass 2: 3x3 convolution over the eight feature channels stored in tex1/tex2.
//
// Each thread produces one pixel. The nine neighbours are fetched from both
// textures, every sample is split into its positive part max(v, 0) and its
// negated negative part max(-v, 0), and the resulting 36 float4 vectors are
// each multiplied by a 4x4 weight matrix and summed with a bias. The two
// float4 results are stored in tex3 and tex4.
//
// blockStart: added to the per-thread offset to form the pixel position.
// threadId:   only threadId.x is used; it is passed to Rmp8x8 (defined outside
//             this file — presumably remaps the linear thread index to a 2D
//             offset inside the block; confirm against the Magpie runtime).
void Pass2(uint2 blockStart, uint3 threadId) {
	const uint2 pixel = Rmp8x8(threadId.x) + blockStart;
	const uint2 bounds = GetInputSize();
	if (bounds.x <= pixel.x || bounds.y <= pixel.y) {
		return;
	}

	const float2 texel = GetInputPt();
	const float2 mid = (pixel + 0.5f) * texel;
	const float2 lo = mid - texel;
	const float2 hi = mid + texel;

	// Sample coordinates of the 3x3 neighbourhood, named after this layout:
	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	const float2 uvA = lo;
	const float2 uvB = float2(lo.x, mid.y);
	const float2 uvC = float2(lo.x, hi.y);
	const float2 uvD = float2(mid.x, lo.y);
	const float2 uvE = mid;
	const float2 uvF = float2(mid.x, hi.y);
	const float2 uvG = float2(hi.x, lo.y);
	const float2 uvH = float2(hi.x, mid.y);
	const float2 uvI = hi;

	// tex1: fetch each tap, keep max(-v, 0) in n<tap>1 and max(v, 0) in <tap>1.
	float4 a1 = tex1.SampleLevel(sam, uvA, 0);
	const float4 na1 = max(-a1, 0);
	a1 = max(a1, 0);

	float4 b1 = tex1.SampleLevel(sam, uvB, 0);
	const float4 nb1 = max(-b1, 0);
	b1 = max(b1, 0);

	float4 c1 = tex1.SampleLevel(sam, uvC, 0);
	const float4 nc1 = max(-c1, 0);
	c1 = max(c1, 0);

	float4 d1 = tex1.SampleLevel(sam, uvD, 0);
	const float4 nd1 = max(-d1, 0);
	d1 = max(d1, 0);

	float4 e1 = tex1.SampleLevel(sam, uvE, 0);
	const float4 ne1 = max(-e1, 0);
	e1 = max(e1, 0);

	float4 f1 = tex1.SampleLevel(sam, uvF, 0);
	const float4 nf1 = max(-f1, 0);
	f1 = max(f1, 0);

	float4 g1 = tex1.SampleLevel(sam, uvG, 0);
	const float4 ng1 = max(-g1, 0);
	g1 = max(g1, 0);

	float4 h1 = tex1.SampleLevel(sam, uvH, 0);
	const float4 nh1 = max(-h1, 0);
	h1 = max(h1, 0);

	float4 i1 = tex1.SampleLevel(sam, uvI, 0);
	const float4 ni1 = max(-i1, 0);
	i1 = max(i1, 0);

	// tex2: same split, stored in <tap>2 / n<tap>2.
	float4 a2 = tex2.SampleLevel(sam, uvA, 0);
	const float4 na2 = max(-a2, 0);
	a2 = max(a2, 0);

	float4 b2 = tex2.SampleLevel(sam, uvB, 0);
	const float4 nb2 = max(-b2, 0);
	b2 = max(b2, 0);

	float4 c2 = tex2.SampleLevel(sam, uvC, 0);
	const float4 nc2 = max(-c2, 0);
	c2 = max(c2, 0);

	float4 d2 = tex2.SampleLevel(sam, uvD, 0);
	const float4 nd2 = max(-d2, 0);
	d2 = max(d2, 0);

	float4 e2 = tex2.SampleLevel(sam, uvE, 0);
	const float4 ne2 = max(-e2, 0);
	e2 = max(e2, 0);

	float4 f2 = tex2.SampleLevel(sam, uvF, 0);
	const float4 nf2 = max(-f2, 0);
	f2 = max(f2, 0);

	float4 g2 = tex2.SampleLevel(sam, uvG, 0);
	const float4 ng2 = max(-g2, 0);
	g2 = max(g2, 0);

	float4 h2 = tex2.SampleLevel(sam, uvH, 0);
	const float4 nh2 = max(-h2, 0);
	h2 = max(h2, 0);

	float4 i2 = tex2.SampleLevel(sam, uvI, 0);
	const float4 ni2 = max(-i2, 0);
	i2 = max(i2, 0);

	// First group of four output channels (stored in tex3).
	float4 conv1 = mul(a1, float4x4(0.20989326, 0.020975577, -0.005522964, -0.10013134, 0.013517254, -0.03347422, 0.40903455, 0.013940953, 0.01066957, 0.2569982, -0.018764338, -0.37931216, -0.20921241, -0.3565134, 0.1776639, 0.081515394));
	conv1 += mul(b1, float4x4(-0.2555968, -0.05024504, 0.046776827, 0.38626888, -0.21787676, 0.0013136056, -0.13391882, 0.00022813173, -0.042842478, -0.10413157, -0.008385445, -0.11843704, 0.062092766, -0.40029097, 0.31873867, -0.19030346));
	conv1 += mul(c1, float4x4(-0.19435422, -0.56006145, 0.050693467, -0.8857939, -0.0677575, -0.30498, 0.012069988, 0.026757652, -0.16890685, 0.04575225, -0.08477036, -0.018015383, 0.0002810964, 0.0772843, 0.00017034424, -0.228497));
	conv1 += mul(d1, float4x4(0.092564344, 0.061863884, -0.13135873, 0.10290956, 0.1670116, -0.08312144, -0.020718448, 0.06729496, -0.06338295, -0.09972319, 0.18505506, -0.2622095, 0.045575716, 0.12836345, -0.22356766, 0.28924033));
	conv1 += mul(e1, float4x4(0.23605964, 0.46831363, 0.21037713, 0.3901851, -0.122640595, -0.29213053, 0.14194407, -0.3353137, 0.07847812, 0.3094049, -0.050705243, -0.23498294, 0.24583417, 0.3703223, 0.1121086, 0.20645288));
	conv1 += mul(f1, float4x4(-0.08616125, -0.13809866, 0.27488732, 0.19573413, -0.20682202, 0.01106275, -0.018731792, 0.048580807, -0.097444884, -0.03766069, -0.12636039, 0.3133589, -0.12802023, 0.1988174, 0.19551867, -0.2720954));
	conv1 += mul(g1, float4x4(0.03484159, 0.05830728, 0.028816089, 0.27173883, 0.15579484, -0.07751753, -0.033748254, 0.22559631, -0.35857964, 0.1043378, 0.5031367, 0.031042032, 0.071555324, -0.24148308, -0.24156207, 0.104249395));
	conv1 += mul(h1, float4x4(0.04002337, -0.17350379, -0.1802324, -0.23008482, 0.2917599, 0.1801853, 0.041955303, 0.015545025, 0.069034904, 0.19370675, 0.097300164, 0.11832116, -0.23043779, 0.33832225, -0.029885143, -0.022836795));
	conv1 += mul(i1, float4x4(0.040476788, 0.3176767, -0.2372066, 0.24106048, 0.28147677, -0.06513699, -0.22784042, -0.46840426, -0.23415963, -0.067057185, -0.013863767, 0.30710638, 0.06337683, -0.1774192, 0.05082387, -0.02581459));
	conv1 += mul(a2, float4x4(-0.13767451, 0.26832962, 0.018361554, 0.2665501, -0.22070843, 0.10799693, 0.09780551, 0.042999722, 0.3302224, 0.10916339, -0.22705203, 0.040675506, -0.049211837, 0.19487813, 0.051528033, 0.20227027));
	conv1 += mul(b2, float4x4(0.1279485, 0.14895418, 0.40570346, -0.008809808, 0.09898892, 0.035774715, -0.28405192, 0.26836014, -0.096799396, -0.12336552, -0.24413097, 0.12693845, 0.12410443, 0.27200332, -0.18279982, -0.032115027));
	conv1 += mul(c2, float4x4(0.14698029, -0.31720948, 0.24974433, 0.14444488, 0.09503049, -0.02618792, 0.15163966, 0.22923012, -0.004227005, -0.2564904, -0.06648419, -0.07868524, -0.14852846, -0.3513046, -0.1374295, -0.09808154));
	conv1 += mul(d2, float4x4(0.1275583, 0.1875862, -0.15939887, -0.1029876, -0.25886494, -0.07434281, -0.018779758, -0.008408217, 0.06420735, 0.0025367932, 0.073679835, 0.1369152, -0.2256255, 0.26216295, -0.052095387, 0.04673847));
	conv1 += mul(e2, float4x4(0.1147465, 0.14129257, -0.036377613, 0.041968875, 0.031286925, -0.00013609273, -0.248227, 0.10412182, -0.00039004904, 0.01673792, 0.056068443, -0.16470632, -0.042392768, 0.23993582, -0.22915693, 0.36430097));
	conv1 += mul(f2, float4x4(0.23650797, 0.12200628, 0.057768486, 0.23353462, 0.04389849, -0.11567954, -0.12633252, -0.1884369, -0.10636852, -0.115114085, -0.0022040834, 0.041720822, 0.20775628, -0.1127031, -0.060805347, -0.10988217));
	conv1 += mul(g2, float4x4(0.0401325, -0.271267, -0.3003843, 0.010670003, -0.12597936, -0.059235968, 0.08256807, -0.22041298, 0.14655456, 0.07416407, -0.03940599, -0.25057787, -0.043001004, 0.2124355, 0.19165096, 0.077120975));
	conv1 += mul(h2, float4x4(0.01693656, -0.057261657, -0.13366276, -0.15589137, -0.07157646, -0.12266521, 0.24651442, -0.079142615, -0.113005005, -0.15769142, -0.017285366, 0.08821278, 0.28891653, 0.06013908, 0.0038421913, 0.106700204));
	conv1 += mul(i2, float4x4(0.16187043, -0.059908718, -0.050456535, -0.027998367, 0.12749411, -0.07558445, 0.05249467, 0.02001542, 0.03188715, 0.056223337, 0.06117334, 0.022764465, 0.1051409, 0.0011876151, -0.07030176, -0.015487096));
	conv1 += mul(na1, float4x4(0.047084607, 0.06401777, 0.15585798, 0.16639893, 0.025441, -0.020858578, -0.07795479, -0.0045188745, -0.09186016, -0.16865493, 0.02187216, 0.02241868, 0.3175809, 0.25483596, 0.046578035, -0.09617824));
	conv1 += mul(nb1, float4x4(0.08622112, 0.124111585, -0.15246506, -0.072898194, 0.26907462, -0.15550381, 0.036907334, -0.16388376, -0.10869113, 0.113909826, -0.118678264, 0.013610441, -0.1307433, 0.044969033, -0.053201765, -0.058903012));
	conv1 += mul(nc1, float4x4(0.036120024, -0.011461657, -0.10083318, -0.334466, 0.016460553, 0.1781498, 0.15133101, -0.0010224655, 0.10511601, 0.12667589, 0.15001541, 0.14479756, -0.046095166, -0.15012313, -0.009395591, 0.019260757));
	conv1 += mul(nd1, float4x4(0.04500625, -0.037348565, -0.10475762, 0.113254204, -0.17360263, -0.18522957, 0.014305901, 0.07039716, -0.11408359, 0.057783633, -0.028000865, -0.25506407, -0.058175903, 0.0040344223, 0.11234911, -0.07254186));
	conv1 += mul(ne1, float4x4(0.05607878, -0.07737156, 0.01586671, -0.21907675, 0.1729392, -0.09273287, 0.14671144, 0.21306099, -0.1374591, -0.09428349, 0.28138107, 0.08421483, -0.30330884, -0.039166123, -0.18316704, -0.27840406));
	conv1 += mul(nf1, float4x4(-0.15336679, -0.05767407, 0.13347702, 0.10092905, 0.09895612, -0.0839073, -0.16025528, -0.087642424, -0.101612955, 0.4119443, 0.031125817, -0.110090934, 0.056127027, -0.04000313, -0.042920932, 0.08100733));
	conv1 += mul(ng1, float4x4(-0.113653034, -0.10163741, -0.058498476, -0.12347642, -0.20110545, -0.006300695, -0.1328342, -0.0071486877, 0.18334186, 0.15882389, -0.120586954, -0.04277906, -0.13593355, 0.11897087, 0.030404912, -0.23374279));
	conv1 += mul(nh1, float4x4(0.044901595, -0.00010039519, -0.14989527, 0.025639903, 0.23985633, 0.0114784185, 0.056620862, -0.0599113, 0.017398749, 0.3567445, 0.10223932, -0.12609181, 0.0074833618, -0.16702464, -0.033638544, 0.062087793));
	conv1 += mul(ni1, float4x4(-0.0302778, -0.009963125, 0.29761076, 0.08238972, 0.26467612, -0.19331805, -0.09930472, 0.23798122, 0.03599952, 0.24224155, 0.3041322, -0.054690234, 0.05582198, 0.0012778769, 0.041249134, -0.014496484));
	conv1 += mul(na2, float4x4(-0.033623356, -0.18683043, -0.48352727, -0.09534184, 0.16657802, -0.31149274, -0.25840783, -0.16902964, -0.40347067, 0.046952717, 0.15677738, -0.14079048, 0.0444492, -0.012346084, -0.16768047, -0.07540055));
	conv1 += mul(nb2, float4x4(0.2678487, 0.113161474, -0.19962314, 0.23060325, -0.28154588, -0.06956369, 0.08050926, -0.25503877, 0.12565655, 0.5497286, -0.18335307, -0.044097837, 0.058234677, 0.049816858, -0.021038791, 0.14644346));
	conv1 += mul(nc2, float4x4(-0.008438418, 0.080761805, 0.06993718, 0.08508105, 0.11905285, 0.016726421, -0.16668561, 0.026911844, 0.041182615, 0.2760306, 0.18553418, 0.25386074, 0.11789433, 0.094213605, 0.15487063, 0.15375367));
	conv1 += mul(nd2, float4x4(-0.10329284, 0.16198465, -0.0681889, -0.006294233, 0.4592297, -0.12816279, 0.19529971, 0.109294996, 0.043646853, 0.084326275, 0.0635968, -0.11471805, 0.44923568, -0.01125437, -0.19251052, -0.08885202));
	conv1 += mul(ne2, float4x4(-0.108986676, 0.40908077, -0.31152573, -0.13468693, -0.10438951, -0.086357035, 0.13880713, -0.288345, 0.17497768, -0.08021166, 0.07815909, 0.17337689, 0.02700953, -0.016387407, 0.0053377734, 0.109923586));
	conv1 += mul(nf2, float4x4(0.13881513, -0.21179448, -0.104762904, 0.019093828, -0.3383386, 0.14453639, -0.28122503, 0.19449967, -0.035691183, 0.21306588, -0.046144057, 0.17898172, -0.0035024916, -0.054061864, -0.03985455, 0.3264588));
	conv1 += mul(ng2, float4x4(0.02336507, 0.20597245, 0.03627631, 0.04278966, -0.042182084, -0.26431814, 0.122881256, 0.34909293, -0.17958918, 0.050698034, 0.336547, 0.21614759, 0.19511287, -0.20311548, -0.13249207, -0.24043573));
	conv1 += mul(nh2, float4x4(-0.025547924, 0.020525696, 0.375233, -0.02528368, -0.044973124, 0.13667387, -0.08506365, 0.34317508, 0.14618309, 0.108213425, 0.15557359, -0.05340479, -0.27103037, 0.12428249, -0.085362, -0.009073445));
	conv1 += mul(ni2, float4x4(-0.09518274, 0.036228243, -0.2145168, 0.090918355, -0.20793489, 0.19843313, 0.06701371, -0.11499378, -0.033398125, -0.020169621, 0.057314273, 0.0027613493, -0.11993404, 0.12495525, -0.0151242195, 0.1896457));
	conv1 += float4(-0.045150407, -0.034128085, 0.10230384, 0.074793644);

	// Second group of four output channels (stored in tex4).
	float4 conv2 = mul(a1, float4x4(0.10541986, -0.27021417, 0.30589217, -0.06793019, 0.0712113, -0.5818028, -0.09057832, 0.009519015, -0.07754299, 0.009050975, -0.08283811, -0.078837596, -0.008200866, 0.53291875, 0.22918138, 0.09433025));
	conv2 += mul(b1, float4x4(0.35867104, 0.17056245, 0.28573632, 0.45787787, 0.054377224, 0.30656826, -0.13864343, 0.13956884, -0.052365527, -0.17660435, -0.14363506, -0.11313267, -0.15472592, -0.011637987, 0.3057005, 0.40122506));
	conv2 += mul(c1, float4x4(-0.42738816, -0.13046122, -0.4223082, 0.32663476, -0.14648326, 0.056164477, 0.09366789, -0.046335716, 0.00401621, -0.008206323, -0.075975314, 0.046879925, 0.04891574, 0.08912198, 0.32541895, 0.014354832));
	conv2 += mul(d1, float4x4(0.105501, -0.06999185, -0.023181506, 0.13587391, -0.10643463, 0.061667755, 0.24508677, 0.33984032, -0.1698376, -0.05051473, -0.29430416, -0.06635265, -0.031917162, 0.046488285, 0.17973569, -0.025048103));
	conv2 += mul(e1, float4x4(-0.07685088, -0.035609607, 0.07060013, -0.19892506, 0.17084605, -0.19758354, -0.29233304, -0.19821644, -0.047398012, 0.12004138, 0.1643941, 0.043807004, 0.2513805, 0.13687916, 0.23235638, 0.00979058));
	conv2 += mul(f1, float4x4(-0.2601253, -0.0010056786, -0.46147683, -0.117661044, -0.042538162, -0.012710203, -0.034079336, -0.08661733, -0.03908205, 0.104053, -0.045735247, -0.07916684, -0.021913078, -0.0035067864, -0.10581172, 0.1149));
	conv2 += mul(g1, float4x4(0.0421786, 0.0099540735, -0.020447837, -0.27269018, -0.084229656, 0.04271779, 0.036794372, -0.18072419, 0.07743771, -0.109369494, -0.07608079, 0.2973058, -0.1602913, -0.10049883, 0.033048846, -0.3780618));
	conv2 += mul(h1, float4x4(-0.38231418, 0.106174126, 0.07344471, -0.1979349, 0.093251124, -0.07658309, 0.08417288, 0.2981472, -0.047867708, 0.0097399205, -0.11213339, 0.1746439, 0.10045314, -0.030283177, 0.004107288, -0.16744147));
	conv2 += mul(i1, float4x4(0.43939134, 0.14499938, 0.20161533, -0.0067911143, 0.098075844, 0.22099596, 0.099283025, -0.017734041, 0.112658866, -0.12010951, -0.13342896, 0.053806942, 0.017880073, 0.028821323, 0.0082069365, -0.053472634));
	conv2 += mul(a2, float4x4(0.2429229, 0.012143042, -0.029962441, 0.017843649, 0.11972611, -0.07733264, -0.37523645, -0.19887479, -0.18222691, -0.31171882, -0.20578085, 0.040127717, 0.0842879, 0.12601142, -0.07302166, -0.033017557));
	conv2 += mul(b2, float4x4(0.09666541, -0.053779975, -0.045221806, -0.06923458, -0.046158988, -0.12819108, -0.32956856, -0.15813568, 0.12464106, -0.42395857, 0.078095086, -0.12961964, -0.15057011, -0.041440632, 0.04221429, 0.08509352));
	conv2 += mul(c2, float4x4(0.2505401, 0.023106987, -0.0001688444, -0.11545978, 0.044663083, -0.011316191, -0.024175104, 0.033631656, -0.13285598, -0.026969459, 0.02669494, 0.082885765, -0.036615327, 0.06434473, -0.059197906, -0.110084));
	conv2 += mul(d2, float4x4(-0.12549014, 0.25078717, -0.06146062, 0.1406611, 0.13844866, 0.012716272, 0.07641059, 0.04245357, -0.09028008, -0.15924782, -0.14551707, -0.09782215, 0.05188703, -0.12323306, -0.20053494, -0.20062317));
	conv2 += mul(e2, float4x4(-0.26341316, -0.16508758, 0.036919586, -0.17812039, -0.3016191, 0.06403582, 0.12948476, 0.110633194, 0.14551535, -0.09222706, -0.30942333, 0.20120445, 0.059902433, -0.17293817, -0.07280857, -0.36021966));
	conv2 += mul(f2, float4x4(0.11032128, -0.024297172, -0.110301405, -0.09563319, -0.23266938, -0.009982061, 0.18834652, 0.0987435, -0.13652846, -0.025019212, 0.07672643, 0.017108513, -0.043844085, 0.02440773, -0.029404791, 0.034692347));
	conv2 += mul(g2, float4x4(-0.048525557, -0.043118346, 0.12048513, 0.030609682, 0.16658829, 0.19444555, 0.113910906, 0.31148425, -0.1755198, -0.038910154, 0.084356636, 0.12969102, 0.01661835, 0.28915378, -0.032290917, -0.12997934));
	conv2 += mul(h2, float4x4(-0.24347968, 0.032619976, 0.16692804, -0.046297006, 0.0901479, 0.060802385, 0.21347383, 0.29304698, 0.16361152, 0.19639444, 0.0054137907, 0.049575172, 0.20710163, 0.076565325, 0.34911337, 0.35831028));
	conv2 += mul(i2, float4x4(-0.092651226, 0.045491215, 0.11757575, 0.11756375, -0.27768722, 0.010231745, 0.21116765, 0.024840422, 0.0051228474, -0.04532887, -0.013311027, 0.121157385, -0.1053527, -0.00010417442, -0.035180032, 0.2051271));
	conv2 += mul(na1, float4x4(-0.055320628, 0.14249797, -0.13782813, -0.05412119, -0.043079898, -0.18216185, 0.13923723, -0.11468015, -0.09394785, 0.12044827, -0.05177875, 0.1349153, -0.03233552, 0.16400962, -0.11219184, 0.09460802));
	conv2 += mul(nb1, float4x4(-0.018258873, -0.23629102, -0.140925, 0.10609654, 0.024990926, -0.31095183, 0.21505022, 0.0007466126, -0.062110204, 0.24764718, 0.0018352414, 0.03383791, -0.05727847, -0.006963949, -0.23087887, -0.2521535));
	conv2 += mul(nc1, float4x4(-0.08928898, 0.21107556, 0.27720314, 0.3170095, 0.1569246, -0.07950364, -0.035353288, 0.0851358, 0.034223706, -0.1124521, 0.068468235, -0.1876728, -0.09508409, -0.03837469, -0.19909252, 0.09844746));
	conv2 += mul(nd1, float4x4(0.04326774, -0.063746035, 0.13767312, 0.048762802, 0.14155331, -0.21800575, -0.22868122, -0.10928361, 0.15166105, 0.086240664, 0.110339195, -0.0039928076, 0.114750795, 0.19737157, -0.09005264, -0.10637459));
	conv2 += mul(ne1, float4x4(0.023298614, 0.07140441, 0.029475417, -0.14667986, -0.017949682, 0.007795148, -0.044714145, -0.13990426, 0.03870307, -0.067750655, -0.11831945, -0.14363948, 0.00049597165, -0.18959905, -0.20256434, 0.0409640));
	conv2 += mul(nf1, float4x4(0.18983524, 0.07018097, 0.015068278, -0.17990883, -0.12528846, -0.020557154, 0.0106482245, 0.08105856, 0.02577546, -0.25885943, 0.0061467723, -0.058998212, 0.045207195, 0.019213859, -0.021913687, -0.10641617));
	conv2 += mul(ng1, float4x4(-0.005021213, -0.030781588, -0.08722711, 0.045172613, 0.13006134, 0.03640675, -0.18160394, 0.10903534, 0.1283007, 0.053212877, 0.15160874, -0.30678773, 0.0611477, 0.060609598, -0.21533446, 0.2817914));
	conv2 += mul(nh1, float4x4(-0.06942382, -0.08785516, -0.018080644, 0.12124481, -0.0988795, 0.021093542, 0.015752183, 0.057520576, -0.1873821, -0.15041956, 0.12230656, -0.23798561, -0.16819417, 0.07222907, -0.01441512, 0.06420038));
	conv2 += mul(ni1, float4x4(-0.0350732, -0.054145966, 0.008372502, -0.16092199, -0.0671371, 0.057495046, -0.08276416, 0.34617814, 0.11239629, -0.19681981, 0.16116115, 0.046944335, 0.09723501, -0.12488112, -0.031532682, 0.013095191));
	conv2 += mul(na2, float4x4(-0.2309171, 0.10420613, -0.12122516, -0.04000454, -0.20740104, -0.010152015, 0.26092738, 0.13527256, 0.08665683, -0.18393658, -0.030344693, -0.10654187, 0.07108977, -0.28212613, 0.024101965, -0.22189055));
	conv2 += mul(nb2, float4x4(0.06602971, 0.050674047, 0.33251405, -0.07886978, -0.13822217, -0.014285523, 0.22478761, 0.22517748, -0.1175651, 0.11234997, -0.17835312, 0.010875831, 0.20007257, 0.21565825, 0.30876723, -0.029953295));
	conv2 += mul(nc2, float4x4(0.3083618, 0.12779777, 0.112711206, 0.001815444, -0.123584166, 0.03232661, -0.060439207, -0.13411477, 0.30604517, -0.19359338, -0.115064435, -0.03826723, 0.16092177, -0.07926006, -0.27355558, 0.077829085));
	conv2 += mul(nd2, float4x4(-0.020265967, -0.27894706, -0.105033666, -0.10975655, 0.20102961, 0.024541473, 0.21834314, -0.21726306, -0.01132585, -0.16459125, 0.21980706, 0.039996378, -0.15850788, 0.16646145, 0.10387183, -0.35103965));
	conv2 += mul(ne2, float4x4(-0.038195442, 0.02967505, -0.22234862, -0.040221542, 0.06056814, 0.14282827, -0.26034078, 0.32477978, -0.45779508, -0.3667849, 0.22392158, 0.09866475, -0.096611015, 0.12282537, 0.080877006, -0.038721707));
	conv2 += mul(nf2, float4x4(0.12205649, 0.052729234, 0.09086409, 0.13457046, -0.24082763, -0.008418334, -0.24735104, 0.13281673, 0.049058694, 0.046168383, -0.049963474, 0.09272115, 0.12703685, 0.020337742, -0.20470645, -0.07379872));
	conv2 += mul(ng2, float4x4(0.02244616, 0.058318693, -0.05570221, -0.02717316, 0.14189804, -0.0016504574, 0.018723257, -0.05787106, 0.055331856, 0.0030448188, 0.01664426, 0.080254346, -0.15860988, -0.10147442, 0.115529425, -0.12332509));
	conv2 += mul(nh2, float4x4(0.16019078, -0.20631735, -0.018190302, 0.0647328, -0.04840569, 0.083106056, -0.13247506, -0.2112572, -0.10423932, -0.12388437, 0.1951962, 0.15236832, -0.075027406, -0.12183809, -0.07161853, -0.24558437));
	conv2 += mul(ni2, float4x4(-0.06832158, 0.06699966, -0.17887384, 0.025053928, 0.22054252, -0.03332688, -0.089027286, -0.0743864, -0.019737093, 0.1890527, 0.3194981, -0.014847898, 0.0616053, -0.046331815, -0.013838972, -0.19598661));
	conv2 += float4(0.0031252617, 0.028414045, -0.018389644, 0.011216021);

	tex3[pixel] = conv1;
	tex4[pixel] = conv2;
}


//!PASS 3
//!DESC Conv-4x3x3x16
//!IN tex3, tex4
//!OUT tex1, tex2
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass3(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	float4 a1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	float4 b1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 c1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 d1 = tex3.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 e1 = tex3.SampleLevel(sam, pos, 0);
	float4 f1 = tex3.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 g1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 h1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 i1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	float4 na1 = max(-a1, 0);
	float4 nb1 = max(-b1, 0);
	float4 nc1 = max(-c1, 0);
	float4 nd1 = max(-d1, 0);
	float4 ne1 = max(-e1, 0);
	float4 nf1 = max(-f1, 0);
	float4 ng1 = max(-g1, 0);
	float4 nh1 = max(-h1, 0);
	float4 ni1 = max(-i1, 0);

	a1 = max(a1, 0);
	b1 = max(b1, 0);
	c1 = max(c1, 0);
	d1 = max(d1, 0);
	e1 = max(e1, 0);
	f1 = max(f1, 0);
	g1 = max(g1, 0);
	h1 = max(h1, 0);
	i1 = max(i1, 0);

	float4 a2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	float4 b2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 c2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 d2 = tex4.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 e2 = tex4.SampleLevel(sam, pos, 0);
	float4 f2 = tex4.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 g2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 h2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 i2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	float4 na2 = max(-a2, 0);
	float4 nb2 = max(-b2, 0);
	float4 nc2 = max(-c2, 0);
	float4 nd2 = max(-d2, 0);
	float4 ne2 = max(-e2, 0);
	float4 nf2 = max(-f2, 0);
	float4 ng2 = max(-g2, 0);
	float4 nh2 = max(-h2, 0);
	float4 ni2 = max(-i2, 0);

	a2 = max(a2, 0);
	b2 = max(b2, 0);
	c2 = max(c2, 0);
	d2 = max(d2, 0);
	e2 = max(e2, 0);
	f2 = max(f2, 0);
	g2 = max(g2, 0);
	h2 = max(h2, 0);
	i2 = max(i2, 0);

	float4 target1 = mul(a1, float4x4(0.1156422, 0.13656664, 0.23103227, -0.09881847, -0.13118152, 0.063764885, -0.1902535, 0.12580052, -0.057555363, 0.0015611092, 0.009383415, 0.0028447553, -0.12577637, 0.06707094, 0.05323591, 0.087465174));
	target1 += mul(b1, float4x4(0.023715734, 0.15901619, 0.010465818, -0.05401794, 0.12822664, -0.079860024, -0.107430205, -0.09094713, 0.11440009, -0.069189526, 0.1377121, -0.02780827, -0.2594948, -0.008447683, -0.052618783, 0.0995311));
	target1 += mul(c1, float4x4(0.014655754, 0.0976315, -0.10425098, 0.06731683, -0.07336922, -0.09931748, -0.074338034, 0.014602733, 0.0761052, -0.14147633, 0.057346404, -0.10485628, 0.008160006, -0.14553718, -0.14069714, -0.106754564));
	target1 += mul(d1, float4x4(-0.18000032, 0.2654082, 0.07008131, -0.21326934, -0.11475177, 0.110427424, -0.09757059, -0.068473235, 0.14004572, -0.1257574, -0.18653339, -0.0546973, -0.04573617, 0.0062926346, -0.111400455, 0.20940857));
	target1 += mul(e1, float4x4(-0.018083753, 0.2091146, -0.12149297, -0.20310159, 0.19642518, 0.008668434, 0.30470127, 0.080623224, -0.04213514, 0.114459425, -0.20325947, 0.024065504, 0.4724302, -0.12169043, 0.22899939, 0.16189654));
	target1 += mul(f1, float4x4(0.22153069, -0.13286535, 0.21529129, 0.059222966, -0.010648649, -0.07542803, 0.12650701, 0.107978106, -0.0122471545, -0.12456761, -0.05047403, 0.052241012, -0.18476847, 0.023691572, 0.16347644, -0.10157776));
	target1 += mul(g1, float4x4(0.053245157, 0.23913434, -0.06288426, -0.15678102, -0.09103809, -0.070054255, -0.021768395, 0.012513, 0.105658144, -0.2088671, -0.03485171, -0.07802848, -0.08754643, 0.0675039, -0.10190519, 0.03442446));
	target1 += mul(h1, float4x4(-0.028817, 0.11284706, 0.13998732, -0.015143216, -0.03416565, 0.09102063, 0.11161235, 0.08467392, 0.16325544, 0.14942992, -0.12313727, -0.06640328, -0.0750008, -0.018136598, -0.23112826, -0.006661416));
	target1 += mul(i1, float4x4(0.017297093, 0.07989559, 0.13549612, 0.07035857, -0.25493076, 0.061273754, 0.052633338, 0.0782014, -0.17994808, -0.14367908, 0.098241016, -0.07993234, -0.010386358, -0.102339104, 0.023344131, -0.08682215));
	target1 += mul(a2, float4x4(0.36794287, -0.048137277, -0.3692417, 0.07832, -0.023172008, 0.02877666, -0.23517531, 0.1448923, 0.09313475, -0.27063283, 0.028388552, 0.17988816, 0.1006075, 0.028261969, -0.10012888, -0.10348935));
	target1 += mul(b2, float4x4(-0.06629671, 0.35957095, -0.21791938, -0.12429962, 0.054654542, 0.05988639, -0.32374984, 0.009501225, -0.26171863, 0.042992886, 0.29698196, 0.08521328, 0.15199377, 0.16362138, -0.18785295, -0.049852755));
	target1 += mul(c2, float4x4(0.15766738, -0.04841046, 0.14447841, 0.17353393, -0.008089345, 0.04590437, -0.043384884, 0.002877719, 0.08845935, 0.039423246, -0.14808795, -0.03975318, 0.2653877, -0.20700884, 0.07218189, -0.10878484));
	target1 += mul(d2, float4x4(0.11222389, 0.2779044, 0.0847275, -0.16267867, 0.17030342, 0.05503266, -0.22644295, -0.23563059, 0.41185054, -0.43625602, -0.18901125, 0.6115694, -0.084791176, -0.01684559, 0.19077617, -0.07168747));
	target1 += mul(e2, float4x4(0.015268929, -0.14208716, 0.15536898, -0.11922906, -0.021667667, -0.078210905, 0.023766499, -0.18069603, -0.06938558, -0.023576038, -0.2990819, 0.11863158, -0.05013765, 0.061508566, -0.085189775, 0.07901883));
	target1 += mul(f2, float4x4(0.13318339, 0.29247984, 0.14075997, 0.08248716, 0.3436642, -0.099461004, -0.17356718, -0.029998098, 0.11614284, 0.20115575, 0.04850254, -0.109567694, -0.090151444, 0.06976889, 0.12614332, 0.097242));
	target1 += mul(g2, float4x4(0.102283016, 0.2969136, -0.059127506, 0.06053867, 0.102346785, 0.061365493, -0.09023823, -0.14396398, 0.04298546, -0.10845686, -0.16071963, 0.05240062, 0.00294458, 0.01617549, 0.30480185, -0.0020818028));
	target1 += mul(h2, float4x4(0.022530032, -0.04770017, 0.16849731, 0.2684958, -0.20493472, 0.26375678, -0.08210537, 0.11594341, 0.12630959, -0.33804628, -0.066290505, -0.21235433, -0.11481554, 0.045285236, 0.009036264, -0.009541344));
	target1 += mul(i2, float4x4(0.22221607, 0.19683546, 0.088301376, 0.07007941, 0.42560205, -0.2515224, 0.10263357, 0.17257528, -0.025208276, 0.09696816, 0.07462843, -0.1663459, 0.14332424, -0.04554422, 0.1857485, 0.19819035));
	target1 += mul(na1, float4x4(-0.33422568, 0.22908518, -0.052035328, 0.0022050992, 0.22068155, -0.31737608, 0.11867548, -0.1062603, 0.21229419, 0.0637268, 0.06284452, 0.075321406, -0.0017977909, -0.24026957, 0.08011851, -0.016301792));
	target1 += mul(nb1, float4x4(0.18647133, -0.042395514, -0.21644959, 0.020428998, -0.19073069, 0.037881456, 0.15364948, 0.13242447, -0.30524725, 0.056054097, -0.03914103, 0.030670341, -0.0010289366, -0.03421297, 0.34305614, 0.078916825));
	target1 += mul(nc1, float4x4(-0.061559163, 0.33350998, -0.040633813, -0.1973531, -0.17371178, 0.020277103, -0.024941592, 0.06309346, 0.10086231, -0.07366512, 0.16570221, 0.20248237, -0.23286462, 0.2155677, 0.15136743, 0.05190251));
	target1 += mul(nd1, float4x4(-0.089644894, 0.13512145, -0.09810823, 0.1616594, 0.16190928, -0.35417703, -0.05601066, 0.20318456, 0.17348176, 0.074274324, 0.029394915, 0.15095772, 0.12337869, 0.029932164, 0.04123706, -0.049648866));
	target1 += mul(ne1, float4x4(0.46952993, 0.14834478, -0.11927866, 0.07611556, -0.2967575, -0.030506441, -0.1524667, -0.16106017, -0.38649827, 0.18501776, 0.07677004, -0.0828538, -0.43983704, -0.15083657, -0.118309684, 0.13656397));
	target1 += mul(nf1, float4x4(-0.04939808, 0.53252345, 0.12711428, -0.38512766, -0.20486577, 0.031688303, -0.18231112, -0.019054607, -0.034855623, -0.05244254, -0.1425771, 0.0892418, 0.046889585, 0.1430025, -0.12742822, 0.092776656));
	target1 += mul(ng1, float4x4(-0.105744444, -0.10247078, -0.02144931, -0.09396661, -0.03536793, -0.027341979, -0.103435315, 0.12214116, -0.13862023, 0.037751865, 0.40586975, 0.023863355, -0.12592442, -0.0762698, 0.008515978, 0.1552095));
	target1 += mul(nh1, float4x4(0.018858416, 0.053681094, 0.16911085, -0.29219922, -0.182029, 0.02297272, -0.30588147, -0.18948974, -0.05744442, 0.0065371646, 0.16328862, -0.051437955, 0.13113242, -0.07573973, -0.047258016, 0.0882382));
	target1 += mul(ni1, float4x4(-0.021155104, 0.07440132, -0.06681412, -0.20775446, 0.053573515, 0.007910367, -0.26769453, -0.15753269, 0.24886242, -0.004493456, 0.023437606, 0.13257046, 0.104298666, 0.14052817, -0.29093856, 0.006735399));
	target1 += mul(na2, float4x4(-0.1299053, 0.21084401, 0.07395335, 0.025556391, -0.012464804, 0.090624444, -0.1041891, 0.03487812, -0.012958428, -0.22729388, 0.06259986, -0.1693054, -0.12679845, -0.15950051, -0.13191415, 0.1125045));
	target1 += mul(nb2, float4x4(0.1916771, -0.02030791, -0.2001191, 0.01943065, -0.18369348, -0.054252382, -0.11485618, -0.16434757, 0.0587951, 0.15208498, -0.1752913, 0.03718008, -0.07597363, -0.21144252, -0.049415894, -0.010295923));
	target1 += mul(nc2, float4x4(-0.044603452, 0.019383559, -0.24661145, -0.12994917, 0.12697428, -0.13032277, -0.15293793, -0.03483303, -0.104321085, 0.04012559, 0.037243072, 0.079595305, -0.12313407, 0.118987724, 0.038709577, 0.09531991));
	target1 += mul(nd2, float4x4(-0.021859067, 0.009060085, 0.19879933, 0.21082644, -0.07705756, 0.10045584, -0.075999945, 0.15191688, -0.12042984, -0.11578441, 0.29679164, -0.23787339, 0.1087794, -0.1419117, -0.22779143, 0.12054577));
	target1 += mul(ne2, float4x4(0.16636065, 0.21066229, -0.06262401, 0.051833395, 0.05992027, 0.014294402, -0.13363211, -0.11139326, -0.026526988, -0.2071816, -0.03000262, -0.08924753, 0.0979992, -0.08312352, -0.016549548, -0.034920745));
	target1 += mul(nf2, float4x4(0.099836424, -0.19452114, 0.07249264, -0.025459828, 0.12210845, -0.15024027, -0.06490785, -0.080187015, -0.009426102, 0.15876383, -0.19070506, 0.12257102, 0.04862195, 0.0707773, -0.24345201, -0.103591055));
	target1 += mul(ng2, float4x4(-0.039747223, 0.07834283, 0.13246708, -0.021774938, -0.05476214, 0.07021812, 0.0134778535, 0.003289531, 0.11907656, 0.04191671, 0.04860092, -0.041503876, -0.040156245, -0.21329322, 0.2024782, 0.067827046));
	target1 += mul(nh2, float4x4(-0.036722995, 0.12776081, 0.14014143, 0.09107308, 0.18742307, -0.099873625, -0.13149267, -0.18590397, -0.067778006, 0.16363877, -0.007999648, 0.13500053, 0.23733437, 0.16123019, 0.23561893, 0.0365712));
	target1 += mul(ni2, float4x4(0.023911275, -0.03754323, 0.17444386, 0.08616114, 0.21406639, -0.15029684, 0.09355591, -0.2486941, 0.11913366, -0.16174106, -0.10907662, 0.107935205, -0.20745984, -0.06180981, -0.019558005, -0.24215329));
	target1 += float4(-0.16255508, -0.041602854, 0.09628627, 0.12747966);

	float4 target2 = mul(a1, float4x4(0.14002717, 0.058876935, 0.20110254, 0.08939276, 0.03416418, 0.0011943586, 0.042772148, -0.00071322336, -0.115944035, 0.04220234, -0.34941152, -0.01974448, -0.0860279, 0.062355816, -0.023853427, 0.02757322));
	target2 += mul(b1, float4x4(0.07400734, 0.19251242, 0.22637455, -0.12530822, -0.17724502, -0.022523593, -0.15113536, 0.065425, 0.101782374, -0.014717139, -0.098752305, 0.080687046, -0.1023507, 0.019614108, 0.01754361, 0.017383952));
	target2 += mul(c1, float4x4(-0.044900224, -0.04213899, 0.0073328684, 0.16705592, -0.051043745, -0.115500204, -0.07567362, 0.07818187, 0.26050508, 0.20679274, 0.04177571, 0.059024576, -0.12510507, -0.051585447, -0.007354538, 0.041514263));
	target2 += mul(d1, float4x4(0.19596866, -0.085393354, 0.03522195, 0.070734546, -0.10047298, 0.033123884, -0.030003218, -0.060309574, 0.11121212, 0.038920198, -0.09097313, 0.020515997, 0.082481235, 0.08472773, -0.007372676, 0.020294813));
	target2 += mul(e1, float4x4(-0.08415041, -0.2041298, -0.0834695, -0.18762465, 0.26823425, -0.029255247, 0.21203867, 0.01842292, 0.17127061, -0.14378369, 0.18486983, 0.040807612, 0.053938765, -0.0033184371, 0.021192972, -0.28285155));
	target2 += mul(f1, float4x4(-0.071444504, -0.16073905, 0.03151272, 0.31961456, -0.09696413, -0.14652419, 0.012872177, 0.036853626, 0.055909842, 0.023814479, 0.12539348, 0.40904784, 0.065472044, -0.04875745, -0.012401859, 0.055437304));
	target2 += mul(g1, float4x4(-0.020927057, -0.23479983, -0.073076054, -0.019441728, 0.08953939, 0.00085565075, 0.061437223, -0.0912304, 0.088546015, -0.009464413, -0.21220255, -0.13741408, 0.049379412, -0.059064344, 0.019205336, -0.11340151));
	target2 += mul(h1, float4x4(0.091714375, -0.17525947, 0.10243093, 0.037679292, 0.062438603, -0.05920895, -0.041936304, -0.030830177, 0.15641114, -0.13261372, -0.021079037, -0.036029477, 0.051840104, 0.07784452, 0.024798041, -0.079719625));
	target2 += mul(i1, float4x4(0.09153048, 0.09966556, -0.10249195, 0.062159285, -0.041912418, -0.22329834, 0.06683857, -0.07287391, -0.1276734, -0.108105786, -0.076660454, -0.07083524, 0.115786545, 0.043516885, 0.032041304, 0.058955755));
	target2 += mul(a2, float4x4(0.13925591, 0.18807505, 0.19418481, 0.13057134, -0.18483852, 0.07704087, -0.25748852, -0.008577424, 0.0165214, 0.03893396, 0.081021786, -0.19419926, 0.21641012, 0.047428373, -0.08350786, -0.14157358));
	target2 += mul(b2, float4x4(-0.06301399, -0.10051874, 0.050919298, -0.011019032, -0.09310829, -0.09138247, -0.16847654, 0.059362046, 0.09107295, 0.06165534, -0.14288484, -0.09833287, 0.116363674, 0.20607105, 0.28841344, -0.09095499));
	target2 += mul(c2, float4x4(-0.21624683, -0.01876206, 0.008987255, 0.17512046, -0.07461909, 0.124108806, -0.054439757, 0.0063252384, -0.24328436, 0.12330878, -0.09306248, -0.046553027, 0.07773235, 0.08965016, 0.0025699693, 0.06252218));
	target2 += mul(d2, float4x4(0.17797774, -0.0768457, -0.06500614, 0.010914941, 0.052788664, 0.10169022, -0.11962388, -0.10176263, -0.52695477, 0.10339165, 0.12893896, 0.016989866, -0.070845306, 0.011061218, -0.033032518, 0.13843493));
	target2 += mul(e2, float4x4(0.4498575, 0.3626344, -0.18857695, 0.12901132, 0.050753895, 0.03323978, -0.15807427, 0.050633483, -0.35924155, 0.13558777, 0.07132256, -0.20883714, 0.23128356, 0.2943383, 0.011521201, -0.21517687));
	target2 += mul(f2, float4x4(-0.007034323, -0.08821435, -0.1275898, -0.15626103, 0.1458542, 0.26724494, -0.118883595, -0.0062981425, 0.07331739, -0.061295208, -0.008509335, -0.012484612, -0.010828551, -0.11301564, -0.078878716, -0.07692456));
	target2 += mul(g2, float4x4(-0.17712432, 0.020956295, 0.118008055, 0.09609794, 0.22146885, 0.20994097, -0.11431106, -0.10710715, -0.15350081, 0.118692145, -0.028190786, 0.021440385, 0.053412, -0.06350743, -0.03998433, 0.061913643));
	target2 += mul(h2, float4x4(-0.07220576, 0.21927893, 0.029267995, 0.107059665, 0.114823125, -0.115261756, -0.18801664, 0.04473252, -0.055653024, 0.11297751, 0.15545851, -0.012991604, 0.1803409, 0.1982345, -0.07486266, -0.09845943));
	target2 += mul(i2, float4x4(-0.0855076, 0.014239223, 0.15630183, 0.21274531, -0.24398185, -0.039692834, -0.167163, 0.09103569, 0.029505143, 0.0986762, 0.015726546, 0.015572646, 0.16977786, -0.08617271, 0.13340445, -0.14292516));
	target2 += mul(na1, float4x4(-0.07120758, -0.1391182, -0.12895927, -0.05497231, 0.017502422, 0.21387358, 0.11369438, -0.09802215, 0.23512627, -0.18750496, 0.3741736, 0.07218814, 0.050294157, -0.03545248, 0.1803603, -0.05216715));
	target2 += mul(nb1, float4x4(-0.031216163, 0.26304567, -0.22097221, 0.0057130447, 0.05476227, 0.048769098, 0.11701435, -0.08043882, 0.121324, -0.07633719, 0.019091062, 0.1056272, 0.19340484, -0.11655276, -0.06859909, -0.20875669));
	target2 += mul(nc1, float4x4(-0.1303287, 0.23683752, -0.14536002, -0.12238158, -0.024545986, -0.09032069, 0.03192402, -0.22449107, 0.2297885, 0.02040227, 0.00034511733, -0.0878228, 0.184152, -0.070972465, -0.010276752, -0.1974931));
	target2 += mul(nd1, float4x4(-0.345411, -0.088238314, -0.020721637, -0.19773935, -0.08967216, 0.11257784, 0.11590796, 0.047473334, 0.20315827, 0.08028863, -0.053076692, 0.04220213, 0.0463197, -0.11993164, 0.17273119, -0.10105775));
	target2 += mul(ne1, float4x4(0.01774352, -0.029116748, -0.070671946, 0.03868912, -0.23905252, 0.122819565, -0.13782008, -0.11386684, -0.15104173, 0.06922476, -0.40653947, -0.041311335, 0.03382718, 0.17504995, 0.19865142, 0.20958701));
	target2 += mul(nf1, float4x4(0.019477593, -0.13480781, -0.15261935, -0.29111782, -0.009433358, 0.07510615, -0.07673836, -0.092863046, -0.15928364, -0.18979515, 0.23357031, -0.096665405, 0.017931713, 0.15517262, -0.045679327, -0.13043073));
	target2 += mul(ng1, float4x4(0.009786184, 0.23618346, 0.08964326, -0.07550377, -0.21214269, 0.008612741, 0.012998613, 0.08797401, 0.16580902, 0.018369747, 0.31754863, 0.094271086, -0.3186572, 0.013351233, -0.04407326, 0.0920314));
	target2 += mul(nh1, float4x4(-0.025626086, 0.09697167, -0.013395247, -0.080764554, -0.19025484, 0.25081167, -0.008351234, 0.009649054, -0.045282297, 0.02762338, 0.09182815, -0.015618593, -0.24248622, -0.0027028685, -0.026439957, 0.06903493));
	target2 += mul(ni1, float4x4(0.15144084, 0.09893225, 0.18078536, -0.40492618, 0.006812688, 0.20841157, -0.052535042, -0.03471349, 0.07722477, 0.18913163, 0.06806257, 0.13268931, -0.23726766, -0.06573527, -0.07974115, 0.00016083609));
	target2 += mul(na2, float4x4(-0.22123417, 0.043395992, -0.075050056, 0.040263254, 0.05219495, -0.10119571, 0.06624045, 0.006088249, -0.02443482, 0.22211014, 0.11706287, 0.09821594, -0.26269525, -0.045644283, 0.1594094, 0.05119857));
	target2 += mul(nb2, float4x4(-0.1359838, 0.085772105, -0.14989698, 0.22662053, -0.13730896, 0.13598563, -0.22069088, -0.049138095, -0.11819638, 0.00615722, 0.22080155, -0.18276499, 0.13765272, 0.026108319, -0.16875726, -0.04851573));
	target2 += mul(nc2, float4x4(-0.23633143, -0.04675013, 0.13207665, 0.17955893, -0.057579413, -0.007248268, -0.11771674, 0.053317282, 0.06935881, -0.07843104, -0.051989514, -0.101527795, 0.030873962, 0.05374762, 0.15865721, -0.11873757));
	target2 += mul(nd2, float4x4(-0.17574823, 0.116152145, 0.038584445, 0.06896235, 0.045519844, -0.003343947, -0.18241419, -0.0559283, 0.1285456, -0.06100108, 0.072168864, 0.2383614, 0.06786445, -0.110831186, 0.0017635048, -0.11216164));
	target2 += mul(ne2, float4x4(-0.22214325, -0.16752025, 0.39590892, 0.0366774, -0.09062008, 0.04298391, -0.2098661, -0.007913526, 0.27807632, -0.0072328355, -0.123739436, 0.017585058, -0.0792693, -0.012500297, -0.0028807693, -0.0010119011));
	target2 += mul(nf2, float4x4(0.014059116, 0.19940482, 0.16831028, 0.16160843, -0.23937507, -0.0070899655, 0.05102661, 0.14583974, 0.04344956, 0.21863829, 0.014209773, -0.063842624, -0.19981036, 0.09243793, 0.24139273, 0.11667779));
	target2 += mul(ng2, float4x4(0.16715737, -0.09880053, 0.00053459726, -0.08722921, -0.050105397, -0.01993378, -0.15830508, -0.028736366, -0.03423738, -0.13328381, -0.1851269, 0.012596559, 0.16408625, 0.10486815, -0.011303046, -0.025475042));
	target2 += mul(nh2, float4x4(0.118060954, -0.24267668, -0.0098548755, -0.04774737, -8.479728e-05, 0.11292645, -0.05507332, -0.20990159, -0.16743746, -0.17963362, -0.14095132, 0.19843975, -0.032164577, -0.21628135, -0.12668937, -0.008645119));
	target2 += mul(ni2, float4x4(0.11424831, -0.19821498, 0.016948126, 0.0033053497, 0.24253003, 0.24522384, -0.13992928, 0.08576702, -0.15157521, -0.08158828, 0.07676344, -0.08844756, -0.02293248, -0.052961793, 0.08597288, -0.07834255));
	target2 += float4(-0.07366732, -0.06278686, 0.11547288, -0.04786791);

	tex1[gxy] = target1;
	tex2[gxy] = target2;
}


//!PASS 4
//!DESC Conv-4x3x3x16
//!IN tex1, tex2
//!OUT tex3, tex4
//!BLOCK_SIZE 8
//!NUM_THREADS 64

// Pass 4: for one pixel, reads the 3x3 neighbourhood of tex1 and tex2, splits
// every sample into its positive part max(x, 0) and its negated negative part
// max(-x, 0), and writes two float4 results to tex3 and tex4.
// Each result is the sum of 36 float4-by-float4x4 products
// (9 taps x 2 textures x 2 signs) plus one constant float4.
//
// blockStart: added to the per-thread offset to form the pixel coordinate.
// threadId:   only threadId.x is used; it is handed to Rmp8x8.
// NOTE(review): Rmp8x8 is not defined in this file; presumably it maps the
// linear thread index to a 2D offset inside the 8x8 block - confirm.
void Pass4(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	// Threads whose coordinate lies outside the input do no work.
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	// The +0.5 targets the centre of texel gxy, assuming inputPt is the size of
	// one texel in normalized coordinates - TODO confirm against GetInputPt.
	float2 pos = (gxy + 0.5f) * inputPt;

	// Layout of the 3x3 window (columns = x offset, rows = y offset).
	// The numeric suffix names the source texture: 1 = tex1, 2 = tex2.
	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	float4 a1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	float4 b1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 c1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 d1 = tex1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 e1 = tex1.SampleLevel(sam, pos, 0);
	float4 f1 = tex1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 g1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 h1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 i1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	// Negated negative part of each tex1 sample. This must be computed before
	// the clamp below, which overwrites the raw samples.
	float4 na1 = max(-a1, 0);
	float4 nb1 = max(-b1, 0);
	float4 nc1 = max(-c1, 0);
	float4 nd1 = max(-d1, 0);
	float4 ne1 = max(-e1, 0);
	float4 nf1 = max(-f1, 0);
	float4 ng1 = max(-g1, 0);
	float4 nh1 = max(-h1, 0);
	float4 ni1 = max(-i1, 0);

	// Replace each tex1 sample with its positive part.
	a1 = max(a1, 0);
	b1 = max(b1, 0);
	c1 = max(c1, 0);
	d1 = max(d1, 0);
	e1 = max(e1, 0);
	f1 = max(f1, 0);
	g1 = max(g1, 0);
	h1 = max(h1, 0);
	i1 = max(i1, 0);

	// Same 3x3 window, read from tex2.
	float4 a2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	float4 b2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 c2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 d2 = tex2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 e2 = tex2.SampleLevel(sam, pos, 0);
	float4 f2 = tex2.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 g2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 h2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 i2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	// Negated negative part of each tex2 sample (again taken before the clamp).
	float4 na2 = max(-a2, 0);
	float4 nb2 = max(-b2, 0);
	float4 nc2 = max(-c2, 0);
	float4 nd2 = max(-d2, 0);
	float4 ne2 = max(-e2, 0);
	float4 nf2 = max(-f2, 0);
	float4 ng2 = max(-g2, 0);
	float4 nh2 = max(-h2, 0);
	float4 ni2 = max(-i2, 0);

	// Replace each tex2 sample with its positive part.
	a2 = max(a2, 0);
	b2 = max(b2, 0);
	c2 = max(c2, 0);
	d2 = max(d2, 0);
	e2 = max(e2, 0);
	f2 = max(f2, 0);
	g2 = max(g2, 0);
	h2 = max(h2, 0);
	i2 = max(i2, 0);

	// First output vector (stored to tex3). Term order: positive tex1 taps,
	// positive tex2 taps, negative tex1 taps, negative tex2 taps.
	// NOTE(review): the matrix literals are presumably trained weights carried
	// over from the Anime4K shader named in the file header - verify against
	// upstream rather than editing by hand.
	float4 target1 = mul(a1, float4x4(0.012843345, 0.047590222, 0.0052741203, 0.017328946, 0.06774971, -0.028615275, 0.030839639, 0.053735327, -0.093057916, 0.08288735, 0.02991863, -0.040167376, 0.11699043, 0.062987246, 0.038180597, 0.11130321));
	target1 += mul(b1, float4x4(0.047898952, 0.013089616, 0.13206771, 0.053474475, -0.24849094, -0.13717765, -0.14899106, 0.032647215, -0.111574546, -0.017941473, 0.017136412, -0.04121033, 0.04825172, -0.07243479, -0.30205736, -0.009043054));
	target1 += mul(c1, float4x4(-0.006104078, -0.056147296, -0.05430816, -0.012150009, -0.12583707, -0.06810525, -0.18965304, -0.03767409, 0.038220566, 0.12901759, 0.14772348, -0.011318772, 0.10613474, -0.011039028, -0.017407915, -0.035597485));
	target1 += mul(d1, float4x4(0.070510425, 0.07079898, 0.063229784, 0.12203576, -0.08330366, 0.14733653, 0.1879776, 0.038365003, -0.112844236, 0.039776023, 0.1109856, -0.013311713, -0.039772045, 0.055393253, 0.13704132, -0.017909162));
	target1 += mul(e1, float4x4(0.01609076, 0.29408732, -0.27179766, 0.06092111, 0.22690177, -0.16917813, -0.3125674, -0.059012905, -0.095299855, -0.07046006, -0.03500062, 0.14539354, -0.031449903, -0.020992422, 0.11367832, -0.16279401));
	target1 += mul(f1, float4x4(-0.014307108, 0.066424, -0.10264224, 0.03198627, 0.1848716, 0.0827183, 0.055873994, -0.08671376, -0.09059771, -0.0033756928, 0.015373264, 0.1482131, -0.22886358, 0.14303732, 0.060101535, -0.056595195));
	target1 += mul(g1, float4x4(-0.085188106, 0.07173675, 0.112149395, 0.12051379, -0.008070544, 0.17174324, 0.09781431, 0.15725529, -0.11921908, -0.026167218, 0.0726004, 0.1150364, 0.07129521, 0.08404156, -0.06052682, -0.024796983));
	target1 += mul(h1, float4x4(-0.118749954, 0.21372789, 0.1825785, 0.12766796, -0.20407347, -0.31983793, 0.06569954, 0.061804183, -0.1381503, -0.2288562, 0.010778781, 0.046662748, 0.09632992, -0.0007208436, 0.042766806, -0.008586152));
	target1 += mul(i1, float4x4(-0.08471548, 0.11370638, 0.044628102, 0.21962023, -0.1537214, 0.018495824, -0.10132307, 0.055931155, -0.19381963, -0.029650096, -0.12020838, 0.14787269, 0.10709368, 0.091088474, -0.08593706, 0.02723246));
	target1 += mul(a2, float4x4(0.023070067, 0.047927327, -0.0039206124, 0.044357426, 0.078707196, -0.02090998, 0.061532218, -0.01990171, -0.0010075673, -0.02985451, -0.013571645, 0.072454736, -0.08910195, 0.08069201, 0.021186491, -0.015898732));
	target1 += mul(b2, float4x4(0.19465011, -0.099643335, -0.13729279, -0.01785864, 0.07081408, -0.03980578, -0.055030484, -0.007838133, 0.02866604, 0.047467582, 0.0021829177, -0.0085278815, -0.14039196, 0.14613628, -0.08654854, 0.091417976));
	target1 += mul(c2, float4x4(0.2008973, -0.055368304, -0.11570937, -0.020534834, 0.029072378, 0.057559345, -0.12295086, -0.093348056, -0.032486536, -0.024021279, -0.03250597, 0.03629745, -0.08590457, -0.037932087, -0.21787491, 0.06611054));
	target1 += mul(d2, float4x4(0.0013978226, 0.12190444, -0.1388371, 0.053365257, 0.06383916, -0.16512986, 0.020202242, -0.05118216, -0.022544125, 0.022348702, -0.04619122, -0.007816115, 0.16181955, -0.087810166, -0.017245274, 0.2592078));
	target1 += mul(e2, float4x4(-0.29257166, 0.18668509, 0.39435357, -0.015695287, 0.052169085, 0.08033462, -0.06759564, 0.15172167, -0.07392426, 0.08598093, -0.099814445, 0.16442427, -0.23507537, 0.00095621345, 0.09456823, 0.35083038));
	target1 += mul(f2, float4x4(0.09508197, -0.10668374, 0.07861556, -0.18495509, -0.012995353, 0.10549121, 0.20355113, 0.02486487, -0.0010891877, 0.0013024746, 0.040683478, 0.09813279, -0.25718254, -0.080950156, -0.20833632, -0.011176342));
	target1 += mul(g2, float4x4(0.04636551, 0.01815646, -0.061344985, 0.16105172, 0.018154364, 0.08175996, 0.02177905, 0.05214974, 0.056760095, 0.056198932, -0.01944339, 0.10342066, 0.037774805, -0.098509185, -0.050058816, 0.22327778));
	target1 += mul(h2, float4x4(0.3342538, 0.24596402, -0.05070882, -0.1629279, -0.0605624, -0.31846803, -0.030116247, 0.14499578, 0.23033214, 0.100796476, -0.11549748, 0.13272488, 0.09768287, -0.08599002, -0.18570031, -0.095745035));
	target1 += mul(i2, float4x4(0.017806288, 0.03143078, 0.1363342, -0.018307902, 0.036575943, -0.04645106, -0.13187204, -0.019356936, 0.08177283, 0.14059572, -0.026990665, -0.025628868, 0.089009784, -0.054094527, -0.10889895, -0.08352851));
	target1 += mul(na1, float4x4(-0.10441912, 0.06942166, 0.021075722, 0.022823252, 0.14455585, -0.10067584, -0.006786432, -0.15945506, 0.051149122, -0.051351603, -0.012551037, 0.017784216, -0.030743994, 0.06534117, -0.05894921, -0.007193482));
	target1 += mul(nb1, float4x4(-0.105177015, 0.12079406, -0.021824203, 0.0051873215, 0.09426312, 0.0872351, 0.042457238, -0.027718134, -0.04744092, -0.036118995, -0.088347785, 0.025714433, -0.0033455554, 0.0052299164, 0.14114419, -0.23041077));
	target1 += mul(nc1, float4x4(-0.10924918, 0.07170065, 0.15847342, 0.045235954, 0.01170718, 0.09113452, 0.155801, 0.012455027, 0.0091770645, -0.071032606, -0.06911904, -0.0078831315, 0.27796802, -0.08136213, 0.20615137, -0.22055252));
	target1 += mul(nd1, float4x4(0.02993543, -0.011065637, 0.015992155, -0.106134124, -0.26578894, 0.16489314, 0.0020848098, 0.12432517, -0.14845847, 0.11076599, -0.015617476, 0.12498255, 0.009672752, -0.013014179, 0.10577515, 0.02908296));
	target1 += mul(ne1, float4x4(-0.0728776, -0.14159116, 0.105368264, -0.016262107, -0.14621304, -0.0007887494, 0.14413477, 0.11337385, -0.1769697, -0.1076886, 0.08036942, 0.10428512, 0.10336065, -0.15257628, 0.05553209, 0.12439473));
	target1 += mul(nf1, float4x4(-0.067323305, 0.23115864, 0.0817162, 0.13127932, 0.02427729, 0.01246805, 0.021550559, 0.066352196, -0.014213087, -0.022559473, 0.058270242, -0.069260366, -0.1949913, 0.27712336, -0.020843407, 0.16199547));
	target1 += mul(ng1, float4x4(-0.06066066, 0.009365795, -0.005817299, 0.016661849, 0.032292802, 0.10364246, -0.105340734, -0.040422246, 0.0028520338, 0.10786728, 0.041312158, 0.0634878, -0.10283239, -0.13716424, 0.2013461, -0.14106691));
	target1 += mul(nh1, float4x4(-0.14796652, 0.042259417, -0.08663438, 0.09733461, -0.044074174, 0.24739462, 0.04777009, -0.026686348, 0.0027458945, 0.043400105, -0.11496284, 0.08113486, -0.33933377, 0.046819236, -0.12803015, 0.006137677));
	target1 += mul(ni1, float4x4(-0.07903079, -0.009489394, 0.018812884, -0.031424083, 0.14344518, 0.08629371, 0.123602144, 0.045581687, 0.102321856, 0.07221763, 0.14465447, -0.23171869, -0.1145046, -0.088674895, -0.08679749, -0.20322132));
	target1 += mul(na2, float4x4(-0.09741677, 0.0010184142, -0.06932825, 0.044964395, 0.03060611, 0.11817057, 0.04148144, 0.000755089, 0.018646225, -0.1362759, 0.045627713, -0.01720389, -0.013920286, 0.0041473205, 0.023480741, -0.00036270308));
	target1 += mul(nb2, float4x4(-0.047821313, 0.15457056, 0.081069574, -0.061125267, -0.003727664, -0.03735384, -0.00673114, -0.0585745, -0.14427665, 0.21584798, 0.17612408, 0.03723236, 0.09688153, 0.0071055717, 0.0704578, -0.008490558));
	target1 += mul(nc2, float4x4(0.005648931, -0.021415008, 0.07515239, 0.024656001, 0.14356652, -0.09023091, -0.092833556, -0.11933706, -0.17543222, -0.31645912, -0.14794292, -0.10830711, 0.046658885, -0.13449514, -0.032724228, -0.07927336));
	target1 += mul(nd2, float4x4(-0.012330256, 0.030906612, 0.009849825, 0.16186711, 0.105316125, 0.1066287, 0.007410255, 0.08471377, -0.06755245, 0.2835302, 0.06922882, 0.18501134, -0.10781668, -0.021025939, -0.057754997, -0.19532007));
	target1 += mul(ne2, float4x4(0.09254016, 0.21572222, -0.250398, -0.017990865, 0.10726608, -0.13617107, 0.06726572, -0.17355372, 0.07552837, -0.01980061, 0.10523871, -0.062427603, -0.1769102, 0.35534126, -0.22155605, -0.13921477));
	target1 += mul(nf2, float4x4(0.0054315915, 0.028563919, -0.030617325, 0.12851912, 0.0020591016, -0.07287573, -0.15371658, -0.3468236, 0.042036943, -0.19993319, -0.1311562, -0.11087494, -0.033534657, -0.049439076, 0.07299748, 0.049393892));
	target1 += mul(ng2, float4x4(0.04817828, 0.009956909, 0.08608736, -0.04149299, 0.07101367, -0.03388178, 0.08030968, -0.032450564, 0.14994971, -0.006995002, 0.13461865, -0.061656967, -0.044900555, -0.05698395, 0.07130313, -0.17835349));
	target1 += mul(nh2, float4x4(0.09259944, -0.1760367, -0.05008204, 0.12799591, 0.10526596, 0.25768888, 0.11187724, -0.06537007, 0.11869906, -0.30243787, 0.1930932, -0.13290296, 0.017331708, 0.04682896, 0.02930385, 0.15250616));
	target1 += mul(ni2, float4x4(-0.01343636, -0.015147329, -0.12101166, 0.04787181, 0.088516094, -0.0716172, 0.012281597, -0.01175244, -0.036102388, -0.16996604, 0.0068835146, 0.16938321, -0.019361602, -0.07008898, -0.111906745, -0.008676077));
	// Constant term added after all 36 products.
	target1 += float4(0.03128986, -0.070663765, -0.056307543, -0.043389197);

	// Second output vector (stored to tex4): same 36 inputs in the same order,
	// with a different set of matrices and a different constant term.
	float4 target2 = mul(a1, float4x4(-0.010251427, -0.045750465, 0.016315231, -0.008768869, 0.017431414, 0.080067836, 0.025827147, 0.10838066, 0.0024869177, -0.034495536, 0.09772538, 0.07213915, 0.016637174, 0.040788822, -0.022752339, 0.10970543));
	target2 += mul(b1, float4x4(0.11526194, 0.09676918, -0.04237834, -0.2271947, 0.12261753, -0.24500768, 0.10468346, 0.13780572, -0.009849901, 0.023189532, 0.0011982447, 0.04185303, 0.045187697, 0.06505389, 0.096869685, -0.1784324));
	target2 += mul(c1, float4x4(0.04672689, 0.13536161, -0.1818021, -0.20668268, 0.07533596, -0.032177944, -0.024819814, 0.036118865, 0.012960037, -0.04256549, 0.03154665, 0.10697645, 0.0455828, 0.15624708, 0.0880299, -0.044446476));
	target2 += mul(d1, float4x4(-0.03187084, -0.04798656, 0.05435525, -0.060023244, -0.02988392, -0.13252808, -0.13699181, -0.013882888, 0.052836955, -0.051288467, -0.048392758, -0.02818318, -0.045959223, -0.0385304, -0.113381095, 0.048340388));
	target2 += mul(e1, float4x4(0.06799445, -0.32721373, 0.09433875, -0.24025385, 0.0029125893, -0.029136823, 0.01100064, -0.12017942, -0.12278812, -0.0646935, 0.009398038, -0.021518359, 0.008572816, 0.15084247, -0.22798048, -0.027803216));
	target2 += mul(f1, float4x4(0.14571115, 0.24804439, 0.13177192, -0.1820655, -0.0030899157, -0.11837261, 0.14447895, 0.11825037, 0.083688706, 0.13209106, 0.051847935, -0.27009267, -0.030820336, 0.15591313, 0.00807933, 0.08577916));
	target2 += mul(g1, float4x4(0.07043623, -0.006127145, -0.16473344, -0.091646075, 0.12019198, 0.02408659, 0.038805984, 0.043282606, 0.09853516, -0.03085117, -0.13666795, 0.057578508, 0.023477113, -0.050639734, 0.05486259, 0.10117338));
	target2 += mul(h1, float4x4(0.07739963, 0.019718317, -0.17859067, -0.107660785, 0.07235146, -0.08198499, -0.13072458, 0.0808431, -0.09421921, -0.024668563, 0.058651946, 0.058679227, -0.041750733, 0.07785575, 0.0375434, -0.11090677));
	target2 += mul(i1, float4x4(0.13032761, 0.2291367, -0.1677081, -0.22246332, 0.03946319, 0.0063910848, 0.09128152, 0.0013804171, -0.034065075, -0.058277655, 0.052419346, -0.030012188, 0.018556409, -0.07521306, 0.12746032, 0.0899423));
	target2 += mul(a2, float4x4(-0.14820024, 0.03316697, 0.074021704, 0.0349015, -0.028731624, -0.03655249, 0.041885335, 0.025598902, -0.007544352, -0.058063164, 0.030487465, -0.073317364, -0.033130456, -0.17607957, 0.0020156964, 0.15351814));
	target2 += mul(b2, float4x4(-0.33111712, 0.11070417, -0.11759775, 0.12881225, -0.10840586, -0.114877716, 0.026571346, 0.01617625, 0.0028098845, 0.07325011, -0.008114658, 0.11581408, 0.0040087802, 0.15237121, 0.10423624, 0.010486565));
	target2 += mul(c2, float4x4(-0.14014785, 0.03670812, 0.041663505, -0.25026393, -0.05651376, -0.009220771, 0.18786587, 0.11221872, -0.0045316, -0.0781469, 0.09609792, -0.077175744, 0.15113525, 0.14979461, -0.003579166, -0.097722545));
	target2 += mul(d2, float4x4(0.005191016, -0.05746076, 0.14736177, -0.37837118, -0.116905205, 0.035447106, -0.1389216, -0.06583864, 0.08867301, -0.027591052, 0.020395119, -0.067704394, -0.078146204, 0.21156693, -0.24100207, -0.34081197));
	target2 += mul(e2, float4x4(0.3395633, -0.16366479, -0.16501908, 0.19205959, -0.1203106, 0.1201394, 0.059141878, 0.024588805, 0.0106182005, -0.007498128, -0.13781549, -0.031079333, 0.45373476, -0.019419974, -0.029461615, -0.109356895));
	target2 += mul(f2, float4x4(-0.20302778, 0.023634301, 0.0037064455, 0.23106048, -0.14157735, 0.115462445, -0.10275177, -0.05708588, 0.0066573587, -0.14406916, -0.029837208, 0.056612004, -0.036978997, 0.07784742, -0.009329581, 0.11628078));
	target2 += mul(g2, float4x4(-0.050052032, 0.061341796, -0.108812004, -0.27657855, 0.07106667, -0.062498234, 0.08073948, 0.18898413, -0.005880379, -0.031624768, 0.0334547, 0.10361753, -0.18414119, -0.070826136, 0.027453694, 0.022999335));
	target2 += mul(h2, float4x4(0.014818375, 0.17337285, 0.10936815, -0.030657725, -0.08041041, 0.022390872, 0.0053962595, 0.090021096, 0.05470518, 0.014654071, 0.06899392, -0.03431451, 0.05177294, -0.13493995, -0.055468578, -0.19131596));
	target2 += mul(i2, float4x4(0.08200318, -0.10802187, -0.075451784, 0.006642357, -0.041665014, -0.05528946, 0.1799087, -0.07113583, -0.016218789, -0.12353001, -0.034801062, 0.06995437, 0.013318846, -0.16708943, 0.17779571, 0.20705931));
	target2 += mul(na1, float4x4(0.10754426, -0.03437161, -0.089123115, -0.12592112, -0.09719291, 0.042339396, -0.02457928, -0.10472151, -0.031175358, -0.06077806, -0.025603233, 0.0030798917, 0.0302328, -0.011108347, -0.08815118, -0.11247357));
	target2 += mul(nb1, float4x4(-0.03634052, -0.0752815, -0.032257803, -0.020932812, -0.01030603, 0.05347118, -0.013455479, -0.1528448, 0.11631174, 0.017359301, 0.0053947037, -0.10187295, -0.034056764, -0.06371101, 0.10579902, 0.06297638));
	target2 += mul(nc1, float4x4(0.0026892002, -0.09832557, 0.07002896, 0.17336288, 0.017382741, 0.0868499, 0.024310237, 0.1024202, 0.016445315, -0.096997134, -0.05655256, -0.03888035, -0.23449722, 0.004868548, -0.046150357, 0.16268611));
	target2 += mul(nd1, float4x4(-0.08197917, 0.06499742, 0.044401966, 0.119590975, 0.17058893, 0.003096477, 0.073047325, -0.2325016, 0.20562899, 0.06886438, -0.10150125, -0.09421983, -0.026852611, 0.11638924, -0.2897435, 0.10056706));
	target2 += mul(ne1, float4x4(0.05599001, 0.20881969, 0.057560008, 0.03211348, 0.07353149, 0.10849278, -0.04358825, -0.07277266, 0.19414866, 0.084341206, -0.054937962, -0.19548011, -0.1875029, -0.13233592, 0.247698, 0.054934226));
	target2 += mul(nf1, float4x4(0.006909254, -0.043635696, -0.0420242, 0.0029297285, -0.011208758, 0.10583326, -0.039475866, -0.091568366, -0.11034183, -0.2710617, -0.15182555, 0.27160573, 0.029486256, -0.17993683, 0.10480137, -0.031949393));
	target2 += mul(ng1, float4x4(0.012359864, -0.024621721, -0.066488825, -0.041012418, 0.0008418082, -0.034133818, 0.1275645, -0.22584224, 0.04127642, 0.021086683, -0.055507325, 0.017740795, -0.10207868, -0.02459281, -0.16278388, 0.2084072));
	target2 += mul(nh1, float4x4(0.07907339, -0.08811312, -0.043821383, -0.12781687, -0.014701197, -0.08600121, -0.07344954, -0.06233793, 0.13561183, 0.17435691, -0.25248256, -0.18915577, 0.11731138, -0.076414265, 0.011668736, -0.24489906));
	target2 += mul(ni1, float4x4(0.015452916, -0.1093781, -0.031768844, -0.049816687, 0.087654404, 0.083113015, -0.11759004, -0.02852037, 0.0119902035, -0.12981133, -0.043321397, 0.30873615, 0.16349368, 0.0475539, -0.12394514, 0.012860273));
	target2 += mul(na2, float4x4(0.024975974, 0.14167881, -0.03849521, 0.092395015, -0.14491238, -0.024630755, 0.1262065, 0.22724074, -0.088403955, 0.069909796, -0.1582284, -0.06366643, 0.03808985, 0.055002328, 0.046191234, -0.15073699));
	target2 += mul(nb2, float4x4(0.040616892, -0.05149903, 0.07913543, -0.12622666, 0.012306014, -0.0072504813, 0.09324519, 0.013837971, 0.033986375, 0.09466625, -0.11271816, 0.06514161, 0.008318977, 0.2319992, -0.23813216, -0.064383216));
	target2 += mul(nc2, float4x4(0.0058016274, 0.07342614, -0.02532061, 0.046294674, -0.14704724, -0.09635743, 0.011660911, -0.028665043, 0.07488793, 0.049912058, -0.23186599, -0.12174707, -0.078130014, -0.17273565, 0.009148666, 0.042669322));
	target2 += mul(nd2, float4x4(0.02457923, 0.06036786, -0.08706319, 0.011597113, 0.0027447701, 0.12410346, 0.07509643, 0.23769653, 0.055913534, -0.030516708, 0.090205066, 0.005610863, -0.0037265806, -0.06458783, 0.08390646, 0.03704848));
	target2 += mul(ne2, float4x4(-0.24644387, 0.09733959, 0.15941189, -0.039000493, -0.34143484, -0.10905996, 0.123846896, -0.025850125, 0.22231472, -0.074195, 0.17869541, 0.007901206, -0.07893139, -0.0031443893, -0.2252749, 0.020515904));
	target2 += mul(nf2, float4x4(0.046822242, 0.19209228, 0.10584968, -0.20782734, 0.020917192, 0.064485386, 0.022432446, 0.0021164739, 0.053817958, 0.2291973, 0.15079306, -0.18283905, 0.090974085, 0.24965459, -0.11586238, -0.1068585));
	target2 += mul(ng2, float4x4(-0.018472567, -0.09019175, -0.0014198436, 0.11438912, -0.18806975, 0.017498987, 0.06471353, -0.11078878, -0.09412236, -0.11218875, 0.077031404, -0.18779173, -0.025784107, -0.031477705, -0.10906885, 0.074243516));
	target2 += mul(nh2, float4x4(-0.06388332, 0.0813248, 0.1583895, -0.17604364, 0.02474024, 0.09227594, -0.07166613, -0.046409506, -0.20977338, 0.058364637, -0.014288648, 0.23180534, -0.03359222, 0.03962627, -0.011652336, 0.08433068));
	target2 += mul(ni2, float4x4(-0.05829235, -0.026256828, 0.051615473, -0.082805336, 0.06738748, -0.093329325, -0.03197624, 0.067339435, -0.06104219, 0.119381785, 0.10763423, -0.31583574, 0.003745323, 0.14953502, -0.009772352, -0.05511591));
	// Constant term added after all 36 products.
	target2 += float4(-0.014193535, -0.035853464, -0.0019574068, 0.035060503);

	// Store both results at this thread's pixel coordinate.
	tex3[gxy] = target1;
	tex4[gxy] = target2;
}


//!PASS 5
//!DESC Conv-3x3x3x16
//!IN INPUT, tex3, tex4
//!BLOCK_SIZE 8
//!NUM_THREADS 64

// Pass 5: final 3x3 convolution over the feature maps stored in tex3 and tex4.
//
// Every tap is split into its positive part max(x, 0) and its negative part
// max(-x, 0); each part is multiplied by its own 4x4 weight matrix and summed.
// A bias and the INPUT texel at the same position are then added, and the rgb
// components are handed to WriteToOutput.
//
// blockStart: added to the Rmp8x8 offset to form this thread's pixel
//             coordinate (presumably the top-left pixel of the block -- confirm
//             against the effect framework).
// threadId:   only threadId.x is read; Rmp8x8 converts it into a 2D offset.
void Pass5(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	// Bounds are checked through CheckViewport only, so the input size is not
	// needed in this pass.
	if (!CheckViewport(gxy)) {
		return;
	}

	float2 inputPt = GetInputPt();
	// Sample position of pixel gxy (assumes inputPt is the size of one texel in
	// normalized coordinates -- confirm against GetInputPt).
	float2 pos = (gxy + 0.5f) * inputPt;

	// Tap naming: columns follow the x offset (-1, 0, +1), rows the y offset.
	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	// Suffix 1 = taps from tex3, suffix 2 = taps from tex4.
	float4 a1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	float4 b1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 c1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 d1 = tex3.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 e1 = tex3.SampleLevel(sam, pos, 0);
	float4 f1 = tex3.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 g1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 h1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 i1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	float4 a2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	float4 b2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 c2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 d2 = tex4.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 e2 = tex4.SampleLevel(sam, pos, 0);
	float4 f2 = tex4.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 g2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 h2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 i2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	// Weights for the positive parts. Every fourth literal (the last entry of
	// each matrix row) is 0.0, so the convolution adds nothing to result.w.
	float4 result = mul(max(a1, 0), float4x4(-0.01858372, 0.017144108, 0.02794388, 0.0, 0.0129101565, -0.0073674284, -0.011766938, 0.0, 0.01970984, 0.01209068, 0.009530311, 0.0, -0.009190449, -0.006996753, -0.0038750458, 0.0));
	result += mul(max(b1, 0), float4x4(0.15856947, 0.10162126, 0.08489005, 0.0, 0.038381726, -0.017771017, -0.03226132, 0.0, -0.011787879, -0.0152445, -0.007564454, 0.0, 0.055921376, 0.08389841, 0.08452836, 0.0));
	result += mul(max(c1, 0), float4x4(0.026705442, -0.0070655374, -0.018199183, 0.0, 0.016254421, -0.025398912, -0.03461042, 0.0, 0.03950644, 0.06586101, 0.0707467, 0.0, -0.03793455, -0.04957139, -0.04777402, 0.0));
	result += mul(max(d1, 0), float4x4(-0.115341224, -0.04463122, -0.016549354, 0.0, -0.059433736, -0.04303295, -0.042805545, 0.0, 0.010830498, -0.011057443, -0.0141014, 0.0, 0.067396216, 0.06553637, 0.06705378, 0.0));
	result += mul(max(e1, 0), float4x4(-0.12767975, -0.19935511, -0.20109995, 0.0, 0.11554901, 0.11426503, 0.11161185, 0.0, -0.22092125, -0.22041021, -0.2142712, 0.0, -0.06326996, -0.061314825, -0.059039716, 0.0));
	result += mul(max(f1, 0), float4x4(0.007717391, -0.046238754, -0.056983955, 0.0, 0.021419598, 0.0036924274, -0.00033630748, 0.0, 0.053556852, 0.0824714, 0.08295022, 0.0, -0.09881205, -0.043157153, -0.040801782, 0.0));
	result += mul(max(g1, 0), float4x4(0.0052828738, 0.049702674, 0.056108, 0.0, 0.009478552, 0.010345037, 0.0094180945, 0.0, -0.010412882, 0.0006965096, 0.0021917222, 0.0, -0.010701383, -0.023212843, -0.024252625, 0.0));
	result += mul(max(h1, 0), float4x4(0.07542127, 0.0739301, 0.06642962, 0.0, -0.08054489, -0.037553925, -0.026762033, 0.0, 0.09727509, 0.102272816, 0.097533874, 0.0, 0.01325714, -0.004582272, -0.006647532, 0.0));
	result += mul(max(i1, 0), float4x4(0.03005975, 0.017012767, 0.007840201, 0.0, -0.028650383, -0.0019064787, 0.01083078, 0.0, -0.071352504, -0.019919744, -0.008299795, 0.0, 0.023253804, 0.042413715, 0.04681489, 0.0));
	result += mul(max(a2, 0), float4x4(-0.052201163, -0.021727808, -0.020888992, 0.0, 0.008365179, -0.016546093, -0.0111018475, 0.0, -0.06236095, -0.019278256, -0.021443967, 0.0, 0.0029381379, -0.0033039588, -0.006425339, 0.0));
	result += mul(max(b2, 0), float4x4(0.02397296, -0.041659098, -0.050882675, 0.0, -0.013487, 0.0067506596, 0.005435185, 0.0, 0.066447854, 0.13331215, 0.13754861, 0.0, 0.028300207, -0.0048033795, -0.010058485, 0.0));
	result += mul(max(c2, 0), float4x4(0.08140248, 0.018564016, 0.0036607496, 0.0, -0.0112075955, 0.0022339798, 0.0045722146, 0.0, -0.045716517, -0.0076076477, -0.0016939791, 0.0, -0.030486025, -0.07539711, -0.07185734, 0.0));
	result += mul(max(d2, 0), float4x4(-0.0155724995, 0.048904862, 0.059412133, 0.0, -0.013894624, -0.0061430936, -0.011662488, 0.0, -0.0052947477, -0.0176474, -0.018611705, 0.0, 0.022075793, 0.031703226, 0.026735537, 0.0));
	result += mul(max(e2, 0), float4x4(-0.18287502, -0.18703277, -0.18331653, 0.0, -0.08616293, -0.011741755, -0.009296464, 0.0, -0.054274965, 0.016794622, 0.022522328, 0.0, 0.06965258, 0.08260611, 0.08285337, 0.0));
	result += mul(max(f2, 0), float4x4(0.08107809, 0.0336241, 0.025449684, 0.0, -0.031931, 0.01179566, 0.019694995, 0.0, 0.025930194, 0.042288166, 0.04673656, 0.0, -0.14357394, -0.11003491, -0.094090074, 0.0));
	result += mul(max(g2, 0), float4x4(0.007188181, 0.050626095, 0.050705966, 0.0, -0.008030409, -0.018670242, -0.019766346, 0.0, 0.014874803, -0.03657919, -0.034044486, 0.0, -0.011178416, -0.004358302, -0.013611815, 0.0));
	result += mul(max(h2, 0), float4x4(0.07987872, 0.11399873, 0.12089382, 0.0, -0.01514355, 0.0068139364, 0.010206274, 0.0, -0.0005701044, -0.011158322, 0.006484812, 0.0, 0.002018227, 0.043359682, 0.042987905, 0.0));
	result += mul(max(i2, 0), float4x4(0.0017806455, -0.0015697709, -0.0018252691, 0.0, 0.0058658062, 0.021681193, 0.028615465, 0.0, -0.054827355, -0.04541651, -0.027485048, 0.0, -0.017649114, 0.017717479, 0.027309911, 0.0));
	// Weights for the negative parts (same taps, negated before clamping).
	result += mul(max(-a1, 0), float4x4(0.02555098, -0.0028983613, -0.005134733, 0.0, -0.0029332284, 0.015552135, 0.022189403, 0.0, -0.019786593, -0.0031676649, -0.0014604586, 0.0, 0.06648065, 0.0672302, 0.04586375, 0.0));
	result += mul(max(-b1, 0), float4x4(-0.06674696, 0.002328631, 0.014039355, 0.0, -0.03636718, 0.014560653, 0.028076636, 0.0, 0.042305287, 0.015249338, 0.0136925895, 0.0, 0.033586804, 0.00701501, -0.011588751, 0.0));
	result += mul(max(-c1, 0), float4x4(-0.039022632, 0.015240631, 0.02699061, 0.0, -0.02614261, 0.0051843156, 0.012590042, 0.0, 0.015304643, -0.022641543, -0.030434309, 0.0, 0.016862666, 0.020819275, 0.022333218, 0.0));
	result += mul(max(-d1, 0), float4x4(0.08056982, 0.026592938, 0.009744146, 0.0, 0.08762212, 0.10150359, 0.09662005, 0.0, -0.044551965, -0.016349116, -0.014629014, 0.0, -0.014341297, -0.030914815, -0.038747486, 0.0));
	result += mul(max(-e1, 0), float4x4(-0.048734166, 0.019775594, 0.03124684, 0.0, -0.2345022, -0.23639877, -0.22958128, 0.0, 0.12412277, 0.10245112, 0.10389806, 0.0, -0.0030797734, -0.01989389, -0.02020691, 0.0));
	result += mul(max(-f1, 0), float4x4(-0.0133485105, 0.029644802, 0.041630358, 0.0, 0.041081797, 0.059993293, 0.060033485, 0.0, -0.02155099, -0.035306025, -0.03838472, 0.0, 0.017466968, -0.01866363, -0.004764589, 0.0));
	result += mul(max(-g1, 0), float4x4(0.0030783121, -0.04064586, -0.04504904, 0.0, -0.023528632, -0.029308239, -0.022441925, 0.0, 0.020095564, 0.018979732, 0.015117934, 0.0, 0.008429918, 0.021180628, 0.020137152, 0.0));
	result += mul(max(-h1, 0), float4x4(0.0012200709, 0.013313984, 0.014122978, 0.0, 0.08750284, 0.038747437, 0.027102578, 0.0, -0.09627132, -0.09706183, -0.09405641, 0.0, -0.05180081, -0.03555434, -0.021694236, 0.0));
	result += mul(max(-i1, 0), float4x4(-0.022396728, -0.018316073, -0.01250564, 0.0, 0.045423746, 0.025315331, 0.010639915, 0.0, 0.05618814, 0.022210265, 0.014195103, 0.0, -0.014828652, -0.010245087, 0.0020570823, 0.0));
	result += mul(max(-a2, 0), float4x4(0.046651457, 0.001333767, -0.003572458, 0.0, -0.0077845114, -0.012861641, -0.015116351, 0.0, 0.01338984, 0.029198132, 0.026183384, 0.0, 0.0014878022, 0.020025207, 0.024829973, 0.0));
	result += mul(max(-b2, 0), float4x4(-0.09506711, -0.06541528, -0.051106647, 0.0, 0.02552611, 0.01181497, 0.0020236392, 0.0, 0.03234602, -0.03153924, -0.035502207, 0.0, -0.034516744, 0.00018784113, 0.0085376045, 0.0));
	result += mul(max(-c2, 0), float4x4(-0.05945615, -0.0046793907, 0.011128929, 0.0, -0.0061961384, -0.0040663416, -0.010319631, 0.0, 0.044197917, -0.033448357, -0.04109943, 0.0, -0.04109929, 0.006773195, 0.016976412, 0.0));
	result += mul(max(-d2, 0), float4x4(0.02855516, -0.033051047, -0.04864978, 0.0, -0.06393814, -0.082921155, -0.0730681, 0.0, -0.058905125, -0.038639963, -0.027698845, 0.0, -0.013616608, -0.007876684, -0.006182652, 0.0));
	result += mul(max(-e2, 0), float4x4(0.15423118, 0.14667909, 0.14534634, 0.0, 0.1485341, 0.096721016, 0.0820024, 0.0, 0.1263968, 0.088775866, 0.083860956, 0.0, 0.04213644, 0.020989005, 0.010447147, 0.0));
	result += mul(max(-f2, 0), float4x4(-0.068275765, -0.018390667, -0.011452603, 0.0, 0.03738383, 0.019398715, 0.005998161, 0.0, -0.0011161854, -0.039955888, -0.04444185, 0.0, 0.052985556, 0.017621813, 0.009551621, 0.0));
	result += mul(max(-g2, 0), float4x4(0.01387326, -0.0033411914, -0.009420935, 0.0, -0.034494568, -0.019219222, -0.009562797, 0.0, 0.0074023325, 0.022065453, 0.027121471, 0.0, 0.00019609048, -0.0042242454, 2.0403608e-05, 0.0));
	result += mul(max(-h2, 0), float4x4(-0.015793918, -0.024342488, -0.037188973, 0.0, 0.004534637, -0.025236975, -0.028567247, 0.0, -0.055682972, -0.054670315, -0.06584981, 0.0, 0.043045517, -0.0075941198, -0.014196169, 0.0));
	result += mul(max(-i2, 0), float4x4(0.0132598495, 0.01775289, 0.017206183, 0.0, 0.010604703, -0.007352816, -0.017301153, 0.0, 0.030967329, 0.027615465, 0.0145311365, 0.0, 0.008636854, -0.033379406, -0.042725433, 0.0));
	// Bias term.
	result += float4(-0.0056639817, -0.0017339308, -0.0011913306, 0.0);

	// Add the INPUT texel at the same position, so the convolution output acts
	// as a correction on top of the source pixel.
	result += INPUT.SampleLevel(sam, pos, 0);

	// Only rgb is written; result.a is discarded.
	WriteToOutput(gxy, result.rgb);
}
